banner

Predict the popularity of board games on BoardGameGeek.com

Kaggle link

Twitch link

Load libraries

library(tidyverse)
library(tidymodels)
library(scales)
library(skimr)
library(kableExtra)

# colours
#pal_jo <- c(viridisLite::magma(8)[2:8], "#4C4C53", "#9B9BA8")

pal_sliced <- c(
  '#e946a5', # pink
  '#68e199', # green
  '#302df0', # blue
  '#6a45b0', # purple
  '#129875', # darker green
  '#4f94bf', # lighter blue
  '#b545ab', # darker pink
  '#000000', # black,
  '#cccccc'  #grey
)

title_colour <- pal_sliced[1]
table_colour <- pal_sliced[4]
bar_colour <- pal_sliced[2]
grid_colour <- pal_sliced[9]

theme_set(theme_bw() %+replace%
    theme(
      # align title and caption to the plot not the panel
      plot.title.position = 'plot',
      plot.caption.position = 'plot',
      # change the title and caption to markdown and move them futher from the plot
      plot.title = element_text(
        size = rel(1.3),
        hjust = 0, 
        margin = margin(c(0, 0, 10, 0)),
        colour = title_colour
      ),
      plot.subtitle = element_text(
        size = rel(1.15),
        hjust = 0, 
        margin = margin(c(0, 0, 15, 0))
      ),
      plot.caption = element_text(
        hjust = 1, 
        margin = margin(c(10, 0, 0, 0))
      ),
      # move axis titles to the left/top and change them to markdown
      axis.title = element_text(hjust = 1),
      # allow the axis values to the markdown as well
      axis.text = element_text(),
      # remove the panel border
      panel.border = element_blank(),
      # put in the axis lines with a slightly thicker line than the gridlines
      axis.line = element_line(colour = grid_colour, size = rel(1.5)),
      # make the tickmarks the same colour
      axis.ticks = element_line(colour = grid_colour),
      # facet strip text left aligned with extra space above
      strip.text = element_text(
        hjust = 0, margin = margin(c(10, 0, 0, 0)), colour = title_colour
      ),
      # clear colour and fill for strip
      strip.background = element_rect(colour = NA, fill = NA),
      # dotted gridlines
      panel.grid = element_line(linetype = 'dotted'),
      # ability to use a different colour for the gridlines
      panel.grid.major.x = element_line(colour = grid_colour),
      panel.grid.major.y = element_line(colour = grid_colour),
      panel.grid.minor.x = element_blank(),
      panel.grid.minor.y = element_blank(),
    )
)

scale_y_pct <- function(
  accuracy = 1L, 
  breaks = pretty_breaks(),
  expand = expansion(mult = c(0, .05)),
  ...
) {
  scale_y_continuous(
    labels = scales::percent_format(accuracy = accuracy, big.mark = ","),
    breaks = breaks,
    expand = expand,
    ...
  )
}

scale_y_comma <- function(
  accuracy = 1L, 
  breaks = pretty_breaks(),
  expand = expansion(mult = c(0, .05)),
  ...
) {
  scale_y_continuous(
    labels = scales::comma_format(accuracy = accuracy),
    breaks = breaks,
    expand = expand,
    ...
  )
}

scale_fill_jo <- function(...) {
  scale_fill_manual(values = pal_sliced, ...)
}

scale_fill_discrete <- scale_fill_jo

update_geom_defaults("bar", list(fill = bar_colour))
update_geom_defaults("col", list(fill = bar_colour))
update_geom_defaults("point", list(colour = bar_colour))
update_geom_defaults("line", list(colour = bar_colour))

Read files


── Column specification ────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  game_id = col_double(),
  min_players = col_double(),
  max_players = col_double(),
  avg_time = col_double(),
  min_time = col_double(),
  max_time = col_double(),
  year = col_double(),
  geek_rating = col_double(),
  num_votes = col_double(),
  age = col_double(),
  owned = col_double()
)
ℹ Use `spec()` for the full column specifications.

── Column specification ────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  game_id = col_double(),
  min_players = col_double(),
  max_players = col_double(),
  avg_time = col_double(),
  min_time = col_double(),
  max_time = col_double(),
  year = col_double(),
  num_votes = col_double(),
  age = col_double(),
  owned = col_double(),
  category11 = col_logical(),
  category12 = col_logical()
)
ℹ Use `spec()` for the full column specifications.

Examine data

Data summary
Name to_build
Number of rows 3499
Number of columns 26
_______________________
Column type frequency:
character 15
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
names 0 1.00 1 82 0 3485 0
mechanic 0 1.00 4 258 0 1758 0
category1 0 1.00 4 26 0 78 0
category2 611 0.83 4 26 0 81 0
category3 1773 0.49 4 26 0 72 0
category4 2636 0.25 4 25 0 62 0
category5 3098 0.11 4 25 0 45 0
category6 3363 0.04 4 25 0 38 0
category7 3453 0.01 4 25 0 21 0
category8 3480 0.01 5 19 0 14 0
category9 3494 0.00 7 19 0 5 0
category10 3495 0.00 6 25 0 3 0
category11 3498 0.00 9 9 0 1 0
category12 3498 0.00 15 15 0 1 0
designer 0 1.00 4 157 0 1905 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
game_id 0 1 89632.37 77040.84 2.00 11164.50 73538.0 160677.0 244522.0 ▇▂▃▃▂
min_players 0 1 2.01 0.67 0.00 2.00 2.0 2.0 8.0 ▂▇▁▁▁
max_players 0 1 5.06 7.24 0.00 4.00 4.0 6.0 200.0 ▇▁▁▁▁
avg_time 0 1 117.24 487.78 0.00 30.00 60.0 120.0 22500.0 ▇▁▁▁▁
min_time 0 1 82.53 214.19 0.00 30.00 45.0 90.0 5400.0 ▇▁▁▁▁
max_time 0 1 116.80 487.84 0.00 30.00 60.0 120.0 22500.0 ▇▁▁▁▁
year 0 1 1996.10 161.95 -3000.00 2003.00 2011.0 2015.0 2018.0 ▁▁▁▁▇
geek_rating 0 1 6.09 0.48 5.64 5.73 5.9 6.3 8.5 ▇▂▁▁▁
num_votes 0 1 2006.03 4644.62 62.00 281.00 618.0 1640.0 77423.0 ▇▁▁▁▁
age 0 1 10.43 3.22 0.00 8.00 11.0 12.0 42.0 ▃▇▁▁▁
owned 0 1 3054.96 6369.31 49.00 622.50 1204.0 2723.0 111807.0 ▇▁▁▁▁

Year

  • if zero then impute
  • otherwise if < 1900 then set to 1900
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  -3000    2003    2011    1996    2015    2018 

names year
Backgammon -3000
Go -2200
year n
0 10
400 1
550 1
700 1
1000 1
1425 1
1600 2
1663 1
1701 1
1715 1
1780 1
1800 2
1810 1
1848 1
1850 1
1870 1
1876 1
1883 1
1895 1
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1904    2004    2011    2007    2015    2018 

Time

  • if zero reset to missing and impute
  • if > 240 then set to 240
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    30.0    60.0   117.2   120.0 22500.0 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   30.00   45.00   82.53   90.00 5400.00 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    30.0    60.0   116.8   120.0 22500.0 

Create recipe

rec <- recipe(geek_rating ~ ., data = train) %>%
  # id role for game_id
  update_role(game_id, new_role = "ID") %>%
  # flags for categories
  step_mutate(
    
    cat_card_game = if_any(starts_with("cate"), ~ .x == "Card Game"),
    cat_card_game =as.numeric( replace_na(cat_card_game, FALSE)),
    
    cat_wargame = if_any(starts_with("cate"), ~ .x == "Wargame"),
    cat_wargame = as.numeric(replace_na(cat_wargame, FALSE)),
    
    cat_fantasy = if_any(starts_with("cate"), ~ .x == "Fantasy"),
    cat_fantasy = as.numeric(replace_na(cat_fantasy, FALSE)),
    
    cat_economic = if_any(starts_with("cate"), ~ .x == "Economic"),
    cat_economic = as.numeric(replace_na(cat_economic, FALSE)),
    
    cat_fighting = if_any(starts_with("cate"), ~ .x == "Fighting"),
    cat_fighting = as.numeric(replace_na(cat_fighting, FALSE)),
    
    cat_science_fiction = if_any(starts_with("cate"), ~ .x == 
      "Science Fiction"),
    cat_science_fiction = as.numeric(replace_na(cat_science_fiction, FALSE)),
    
    cat_dice = if_any(starts_with("cate"), ~ .x == "Dice"),
    cat_dice = as.numeric(replace_na(cat_dice, FALSE)),
    
    cat_bluffing = if_any(starts_with("cate"), ~ .x == "Bluffing"),
    cat_bluffing = as.numeric(replace_na(cat_bluffing, FALSE)),
    
    cat_adventure = if_any(starts_with("cate"), ~ .x == "Adventure"),
    cat_adventure = as.numeric(replace_na(cat_adventure, FALSE)),
    
    cat_miniatures = if_any(starts_with("cate"), ~ .x == "Miniatures"),
    cat_miniatures = as.numeric(replace_na(cat_miniatures, FALSE)),
    
    cat_ww2 = if_any(starts_with("cate"), ~ .x == "World War II"),
    cat_ww2 = as.numeric(replace_na(cat_ww2, FALSE)),
    
    cat_medieval = if_any(starts_with("cate"), ~ .x == "Medieval"),
    cat_medieval = as.numeric(replace_na(cat_medieval, FALSE)),
    
    cat_explor = if_any(starts_with("cate"), ~ .x == "Exploration"),
    cat_explor = as.numeric(replace_na(cat_explor, FALSE)),
    
    cat_deduct = if_any(starts_with("cate"), ~ .x == "Deduction"),
    cat_deduct = as.numeric(replace_na(cat_deduct, FALSE)),
    
    cat_party = if_any(starts_with("cate"), ~ .x == "Party Game"),
    cat_party = as.numeric(replace_na(cat_party, FALSE)),
    
    cat_abstract = if_any(starts_with("cate"), ~ .x == "Abstract Strategy"),
    cat_abstract = as.numeric(replace_na(cat_abstract, FALSE)),
    
    cat_animals = if_any(starts_with("cate"), ~ .x == "Animals"),
    cat_animals = as.numeric(replace_na(cat_animals, FALSE))
  ) %>%
  # flags for mechanic
  step_mutate(
    mech_dice = as.numeric(str_detect(mechanic, "Dice Rolling")),
    mech_hand = as.numeric(str_detect(mechanic, "Hand Management" )),
    mech_powers = as.numeric(str_detect(mechanic, "Variable Player Powers")),
    mech_sets = as.numeric(str_detect(mechanic, "Set Collection")),
    mech_infl = as.numeric(str_detect(mechanic, 
      "Area Control / Area Influence")),
    mech_draft = as.numeric(str_detect(mechanic, "Card Drafting")),
    mech_modular = as.numeric(str_detect(mechanic, "Modular Board")),
    mech_tile = as.numeric(str_detect(mechanic, "Tile Placement")),
    mech_hex = as.numeric(str_detect(mechanic, "Hex-and-Counter")),
    mech_action = as.numeric(str_detect(mechanic, 
      "Action Point Allowance System")),
    mech_coop = as.numeric(str_detect(mechanic, "Co-operative Play")),
    mech_sas = as.numeric(str_detect(mechanic, 
      "Simultaneous Action Selection")),
    mech_auction = as.numeric(str_detect(mechanic, "Auction/Bidding")),
    mech_area = as.numeric(str_detect(mechanic, "Area Movement")),
    mech_worker = as.numeric(str_detect(mechanic, "Worker Placement")),
    mech_grid = as.numeric(str_detect(mechanic, "Grid Movement")),
    mech_simul = as.numeric(str_detect(mechanic, "Simulation")),
    mech_deck = as.numeric(str_detect(mechanic, "Deck / Pool Building")),
    mech_partner = as.numeric(str_detect(mechanic, "Partnerships")),
    mech_point = as.numeric(str_detect(mechanic, "Point to Point Movement")),
    mech_route = as.numeric(str_detect(mechanic, "Route/Network Building"))  
  ) %>%
  # flags for mechanic
  step_mutate(
    design_1 = as.numeric(str_detect(designer, "Reiner Knizia")),
    design_2 = as.numeric(str_detect(designer, "Martin Wallace")),
    design_3 = as.numeric(str_detect(designer, "Wolfgang Kramer")),
    design_4 = as.numeric(str_detect(designer, "Dean Essig")),
    design_5 = as.numeric(str_detect(designer, "Alan R. Moon")),
    design_6 = as.numeric(str_detect(designer, "Bruno Cathala")),
    design_7 = as.numeric(str_detect(designer, "Friedemann Friese")),
    design_8 = as.numeric(str_detect(designer, "Mike Elliott")),
    design_9 = as.numeric(str_detect(designer, "Klaus Teuber")),
    design_10 = as.numeric(str_detect(designer, "Richard H. Berg"))
  ) %>%
  # remove the name, designer, mechanic and category variables
  step_rm(names, mechanic, designer, category1:category12) %>%
  # log transformation
  step_log(num_votes, owned) %>%
  # outliers
  step_mutate(
    min_players = case_when(
      min_players == 0 ~ as.numeric(NA), 
      min_players > 20 ~ 20,
      TRUE ~ min_players
    ),
    max_players = case_when(
      max_players == 0 ~ as.numeric(NA), 
      max_players > 20 ~ 20,
      TRUE ~ max_players
    ),
    min_time = case_when(
      min_time == 0 ~ as.numeric(NA),
      min_time > 240 ~ 240,
      TRUE ~ min_time
    ),
    max_time = case_when(
      max_time == 0 ~ as.numeric(NA),
      max_time > 240 ~ 240,
      TRUE ~ max_time
    ),
    avg_time = case_when(
      avg_time == 0 ~ as.numeric(NA),
      avg_time > 240 ~ 240,
      TRUE ~ avg_time
    ),
    year = case_when(
        year == 0 ~ as.numeric(NA), 
        year < 1900 ~ 1900,
        TRUE ~ year
    )
  ) %>%
  # imputation
  step_impute_mean(min_players, max_players, year, min_time, max_time, avg_time)

Check recipe outputs

Data summary
Name baked
Number of rows 2623
Number of columns 59
_______________________
Column type frequency:
numeric 59
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
game_id 0 1 88298.34 76614.03 2.00 10906.00 69601.00 159125.00 244522.00 ▇▂▂▃▂
min_players 0 1 2.02 0.65 1.00 2.00 2.00 2.00 8.00 ▇▁▁▁▁
max_players 0 1 4.70 2.51 1.00 4.00 4.00 6.00 20.00 ▇▆▁▁▁
avg_time 0 1 80.37 65.29 5.00 30.00 60.00 120.00 240.00 ▇▅▂▁▂
min_time 0 1 67.91 59.52 1.00 30.00 45.00 90.00 240.00 ▇▅▁▁▁
max_time 0 1 80.67 65.14 5.00 30.00 60.00 120.00 240.00 ▇▆▂▁▂
year 0 1 2006.41 13.77 1900.00 2003.00 2010.00 2015.00 2018.00 ▁▁▁▁▇
num_votes 0 1 6.62 1.27 4.14 5.63 6.43 7.40 11.26 ▅▇▅▂▁
age 0 1 10.46 3.20 0.00 8.00 12.00 12.00 42.00 ▃▇▁▁▁
owned 0 1 7.24 1.13 3.89 6.42 7.09 7.92 11.62 ▁▇▇▂▁
geek_rating 0 1 6.09 0.49 5.64 5.73 5.90 6.30 8.50 ▇▂▁▁▁
cat_card_game 0 1 0.28 0.45 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
cat_wargame 0 1 0.18 0.38 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
cat_fantasy 0 1 0.15 0.36 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
cat_economic 0 1 0.11 0.32 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_fighting 0 1 0.11 0.31 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_science_fiction 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_dice 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_bluffing 0 1 0.06 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_adventure 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_miniatures 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_ww2 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_medieval 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_explor 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_deduct 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_party 0 1 0.05 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_abstract 0 1 0.05 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
cat_animals 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_dice 0 1 0.28 0.45 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
mech_hand 0 1 0.27 0.44 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
mech_powers 0 1 0.18 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
mech_sets 0 1 0.15 0.35 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
mech_infl 0 1 0.13 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_draft 0 1 0.12 0.32 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_modular 0 1 0.12 0.32 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_tile 0 1 0.11 0.31 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_hex 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_action 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_coop 0 1 0.08 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_sas 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_auction 0 1 0.08 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_area 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_worker 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_grid 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_simul 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_deck 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_partner 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_point 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
mech_route 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_1 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_2 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_3 0 1 0.01 0.10 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_4 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_5 0 1 0.01 0.10 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_6 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_7 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_8 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_9 0 1 0.01 0.08 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
design_10 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁

Specify workflow

══ Workflow ════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────────────────────
7 Recipe Steps

• step_mutate()
• step_mutate()
• step_mutate()
• step_rm()
• step_log()
• step_mutate()
• step_impute_mean()

── Model ───────────────────────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)

Main Arguments:
  mtry = tune()
  trees = 1500
  min_n = tune()

Computational engine: ranger 

Tune

i Creating pre-processing data to finalize unknown parameter: mtry
mtry min_n .metric .estimator mean n std_err .config
35 12 rmse standard 0.1775053 10 0.0030664 Preprocessor1_Model05
26 2 rmse standard 0.1775596 10 0.0030883 Preprocessor1_Model08
55 22 rmse standard 0.1779476 10 0.0033328 Preprocessor1_Model02
31 13 rmse standard 0.1784322 10 0.0031729 Preprocessor1_Model06
44 24 rmse standard 0.1792009 10 0.0032586 Preprocessor1_Model11

Finalise model

══ Workflow ════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────────────────────
7 Recipe Steps

• step_mutate()
• step_mutate()
• step_mutate()
• step_rm()
• step_log()
• step_mutate()
• step_impute_mean()

── Model ───────────────────────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)

Main Arguments:
  mtry = 35
  trees = 1500
  min_n = 12

Computational engine: ranger 

Outcome

While this was a late submission, the RMSE for my submission was 0.17972, which would put me about 10th on the private leaderboard

Possible improvements

  • Try different model types
  • Is there a better way to create flags for the cateogries and mechanics
banner